{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# To run benchmark script, you will need to install XGBoost \n",
    "# (pip install XGBoost)\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.datasets import load_breast_cancer\n",
    "\n",
    "def load_breast_data():\n",
    "    breast = load_breast_cancer()\n",
    "    feature_names = list(breast.feature_names)\n",
    "    X, y = pd.DataFrame(breast.data, columns=feature_names), breast.target\n",
    "    dataset = {\n",
    "        'problem': 'classification',\n",
    "        'full': {\n",
    "            'X': X,\n",
    "            'y': y,\n",
    "        },\n",
    "    }\n",
    "    return dataset\n",
    "\n",
    "\n",
    "def load_adult_data():\n",
    "    df = pd.read_csv(\n",
    "        \"https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data\",\n",
    "        header=None)\n",
    "    df.columns = [\n",
    "        \"Age\", \"WorkClass\", \"fnlwgt\", \"Education\", \"EducationNum\",\n",
    "        \"MaritalStatus\", \"Occupation\", \"Relationship\", \"Race\", \"Gender\",\n",
    "        \"CapitalGain\", \"CapitalLoss\", \"HoursPerWeek\", \"NativeCountry\", \"Income\"\n",
    "    ]\n",
    "    train_cols = df.columns[0:-1]\n",
    "    label = df.columns[-1]\n",
    "    X_df = df[train_cols]\n",
    "    y_df = df[label]\n",
    "\n",
    "    dataset = {\n",
    "        'problem': 'classification',\n",
    "        'full': {\n",
    "            'X': X_df,\n",
    "            'y': y_df,\n",
    "        },\n",
    "    }\n",
    "\n",
    "    return dataset\n",
    "\n",
    "def load_heart_data():\n",
    "    # https://www.kaggle.com/sonumj/heart-disease-dataset-from-uci\n",
    "    df = pd.read_csv(r'D:\\datasets\\heart-disease-uci\\heart.csv')\n",
    "    train_cols = df.columns[0:-1]\n",
    "    label = df.columns[-1]\n",
    "    X_df = df[train_cols]\n",
    "    y_df = df[label]\n",
    "    dataset = {\n",
    "        'problem': 'classification',\n",
    "        'full': {\n",
    "            'X': X_df,\n",
    "            'y': y_df,\n",
    "        },\n",
    "    }\n",
    "    \n",
    "    return dataset\n",
    "\n",
    "\n",
    "def load_credit_data():\n",
    "    # https://www.kaggle.com/mlg-ulb/creditcardfraud\n",
    "    df = pd.read_csv(r'D:\\datasets\\creditcardfraud\\creditcard.csv')\n",
    "    train_cols = df.columns[0:-1]\n",
    "    label = df.columns[-1]\n",
    "    X_df = df[train_cols]\n",
    "    y_df = df[label]\n",
    "    dataset = {\n",
    "        'problem': 'classification',\n",
    "        'full': {\n",
    "            'X': X_df,\n",
    "            'y': y_df,\n",
    "        },\n",
    "    }\n",
    "    \n",
    "    return dataset\n",
    "\n",
    "\n",
    "def load_telco_churn_data():\n",
    "    # https://www.kaggle.com/blastchar/telco-customer-churn/downloads/WA_Fn-UseC_-Telco-Customer-Churn.csv/1\n",
    "    df = pd.read_csv(r'D:\\datasets\\telco-customer-churn\\WA_Fn-UseC_-Telco-Customer-Churn.csv')\n",
    "    train_cols = df.columns[1:-1] # First column is an ID\n",
    "    label = df.columns[-1]\n",
    "    X_df = df[train_cols]\n",
    "    y_df = df[label] # 'Yes, No'\n",
    "    dataset = {\n",
    "        'problem': 'classification',\n",
    "        'full': {\n",
    "            'X': X_df,\n",
    "            'y': y_df,\n",
    "        },\n",
    "    }\n",
    "    \n",
    "    return dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.model_selection import StratifiedShuffleSplit, cross_validate\n",
    "\n",
    "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "from sklearn.linear_model import SGDClassifier, LogisticRegression\n",
    "\n",
    "from interpret.glassbox import ExplainableBoostingClassifier\n",
    "\n",
    "\n",
    "def format_n(x):\n",
    "    return \"{0:.3f}\".format(x)\n",
    "\n",
    "def process_model(clf, name, X, y, n_splits=3):\n",
    "    # Evaluate model\n",
    "    ss = StratifiedShuffleSplit(n_splits=n_splits, test_size=0.25, random_state=1337)\n",
    "    scores = cross_validate(\n",
    "        clf, X, y, scoring='roc_auc', cv=ss,\n",
    "        n_jobs=None, return_estimator=True\n",
    "    )\n",
    "\n",
    "    record = dict()\n",
    "    record['model_name'] = name\n",
    "    record['fit_time_mean'] = format_n(np.mean(scores['fit_time']))\n",
    "    record['fit_time_std'] = format_n(np.std(scores['fit_time']))\n",
    "    record['test_score_mean'] = format_n(np.mean(scores['test_score']))\n",
    "    record['test_score_std'] = format_n(np.std(scores['test_score']))\n",
    "\n",
    "    return record\n",
    "\n",
    "\n",
    "\n",
    "def benchmark_models(dataset_name, X, y, ct=None, n_splits=3, random_state=1337):\n",
    "    if ct is None:\n",
    "        is_cat = np.array([dt.kind == 'O' for dt in X.dtypes])\n",
    "        cat_cols = X.columns.values[is_cat]\n",
    "        num_cols = X.columns.values[~is_cat]\n",
    "\n",
    "        cat_ohe_step = ('ohe', OneHotEncoder(sparse=False,\n",
    "                                             handle_unknown='ignore'))\n",
    "\n",
    "        cat_pipe = Pipeline([cat_ohe_step])\n",
    "        num_pipe = Pipeline([('identity', FunctionTransformer())])\n",
    "        transformers = [\n",
    "            ('cat', cat_pipe, cat_cols),\n",
    "            ('num', num_pipe, num_cols)\n",
    "        ]\n",
    "        ct = ColumnTransformer(transformers=transformers)\n",
    "\n",
    "    records = []\n",
    "\n",
    "    summary_record = {}\n",
    "    summary_record['dataset_name'] = dataset_name\n",
    "    print()\n",
    "    print('-' * 78)\n",
    "    print(dataset_name)\n",
    "    print('-' * 78)\n",
    "    print(summary_record)\n",
    "    print()\n",
    "\n",
    "    pipe = Pipeline([\n",
    "        ('ct', ct),\n",
    "        ('std', StandardScaler()),\n",
    "        ('linear-sgd', SGDClassifier(random_state=random_state)),\n",
    "    ])\n",
    "    record = process_model(pipe, 'linear-sgd', X, y, n_splits=n_splits)\n",
    "    print(record)\n",
    "    record.update(summary_record)\n",
    "    records.append(record)\n",
    "\n",
    "    pipe = Pipeline([\n",
    "        ('ct', ct),\n",
    "        ('std', StandardScaler()),\n",
    "        ('lr', LogisticRegression(random_state=random_state)),\n",
    "    ])\n",
    "    record = process_model(pipe, 'lr', X, y, n_splits=n_splits)\n",
    "    print(record)\n",
    "    record.update(summary_record)\n",
    "    records.append(record)\n",
    "\n",
    "    pipe = Pipeline([\n",
    "        ('ct', ct),\n",
    "        # n_estimators updated from 10 to 100 due to sci-kit defaults changing in future versions\n",
    "        ('rf-100', RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=random_state)),\n",
    "    ])\n",
    "    record = process_model(pipe, 'rf-100', X, y, n_splits=n_splits)\n",
    "    print(record)\n",
    "    record.update(summary_record)\n",
    "    records.append(record)\n",
    "    \n",
    "    pipe = Pipeline([\n",
    "        ('ct', ct),\n",
    "        ('xgb', XGBClassifier(random_state=random_state)),\n",
    "    ])\n",
    "    record = process_model(pipe, 'xgb', X, y, n_splits=n_splits)\n",
    "    print(record)\n",
    "    record.update(summary_record)\n",
    "    records.append(record)\n",
    "\n",
    "    # No pipeline needed due to EBM handling string datatypes\n",
    "    ebm_main = ExplainableBoostingClassifier(n_jobs=-1, interactions=0, random_state=random_state)\n",
    "    record = process_model(ebm_main, 'ebm main', X, y, n_splits=n_splits)\n",
    "    print(record)\n",
    "    record.update(summary_record)\n",
    "    records.append(record)\n",
    "\n",
    "    return records"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = []\n",
    "n_splits = 3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "------------------------------------------------------------------------------\n",
      "heart\n",
      "------------------------------------------------------------------------------\n",
      "{'dataset_name': 'heart'}\n",
      "\n",
      "{'model_name': 'linear-sgd', 'fit_time_mean': '0.015', 'fit_time_std': '0.003', 'test_score_mean': '0.823', 'test_score_std': '0.013'}\n",
      "{'model_name': 'lr', 'fit_time_mean': '0.013', 'fit_time_std': '0.000', 'test_score_mean': '0.895', 'test_score_std': '0.030'}\n",
      "{'model_name': 'rf-100', 'fit_time_mean': '1.564', 'fit_time_std': '1.628', 'test_score_mean': '0.890', 'test_score_std': '0.008'}\n",
      "{'model_name': 'xgb', 'fit_time_mean': '0.411', 'fit_time_std': '0.455', 'test_score_mean': '0.870', 'test_score_std': '0.014'}\n",
      "{'model_name': 'ebm main', 'fit_time_mean': '9.765', 'fit_time_std': '1.174', 'test_score_mean': '0.916', 'test_score_std': '0.005'}\n",
      "{'model_name': 'ebm-interact', 'fit_time_mean': '1.607', 'fit_time_std': '0.359', 'test_score_mean': '0.905', 'test_score_std': '0.010'}\n"
     ]
    }
   ],
   "source": [
    "dataset = load_heart_data()\n",
    "result = benchmark_models('heart', dataset['full']['X'], dataset['full']['y'], n_splits=n_splits)\n",
    "results.append(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "------------------------------------------------------------------------------\n",
      "breast-cancer\n",
      "------------------------------------------------------------------------------\n",
      "{'dataset_name': 'breast-cancer'}\n",
      "\n",
      "{'model_name': 'linear-sgd', 'fit_time_mean': '0.014', 'fit_time_std': '0.003', 'test_score_mean': '0.989', 'test_score_std': '0.003'}\n",
      "{'model_name': 'lr', 'fit_time_mean': '0.016', 'fit_time_std': '0.000', 'test_score_mean': '0.995', 'test_score_std': '0.005'}\n",
      "{'model_name': 'rf-100', 'fit_time_mean': '0.409', 'fit_time_std': '0.011', 'test_score_mean': '0.992', 'test_score_std': '0.009'}\n",
      "{'model_name': 'xgb', 'fit_time_mean': '0.294', 'fit_time_std': '0.087', 'test_score_mean': '0.995', 'test_score_std': '0.006'}\n",
      "{'model_name': 'ebm main', 'fit_time_mean': '1.026', 'fit_time_std': '0.439', 'test_score_mean': '0.995', 'test_score_std': '0.006'}\n",
      "{'model_name': 'ebm-interact', 'fit_time_mean': '192.805', 'fit_time_std': '126.188', 'test_score_mean': '0.995', 'test_score_std': '0.006'}\n"
     ]
    }
   ],
   "source": [
    "dataset = load_breast_data()\n",
    "result = benchmark_models('breast-cancer', dataset['full']['X'], dataset['full']['y'], n_splits=n_splits)\n",
    "results.append(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "------------------------------------------------------------------------------\n",
      "adult\n",
      "------------------------------------------------------------------------------\n",
      "{'dataset_name': 'adult'}\n",
      "\n",
      "{'model_name': 'linear-sgd', 'fit_time_mean': '0.418', 'fit_time_std': '0.060', 'test_score_mean': '0.841', 'test_score_std': '0.019'}\n",
      "{'model_name': 'lr', 'fit_time_mean': '2.342', 'fit_time_std': '0.177', 'test_score_mean': '0.907', 'test_score_std': '0.003'}\n",
      "{'model_name': 'rf-100', 'fit_time_mean': '2.072', 'fit_time_std': '1.484', 'test_score_mean': '0.903', 'test_score_std': '0.002'}\n",
      "{'model_name': 'xgb', 'fit_time_mean': '15.498', 'fit_time_std': '0.563', 'test_score_mean': '0.922', 'test_score_std': '0.002'}\n",
      "{'model_name': 'ebm main', 'fit_time_mean': '24.705', 'fit_time_std': '3.341', 'test_score_mean': '0.928', 'test_score_std': '0.002'}\n",
      "{'model_name': 'ebm-interact', 'fit_time_mean': '36.416', 'fit_time_std': '0.466', 'test_score_mean': '0.928', 'test_score_std': '0.002'}\n"
     ]
    }
   ],
   "source": [
    "dataset = load_adult_data()\n",
    "result = benchmark_models('adult', dataset['full']['X'], dataset['full']['y'], n_splits=n_splits)\n",
    "results.append(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "------------------------------------------------------------------------------\n",
      "credit-fraud\n",
      "------------------------------------------------------------------------------\n",
      "{'dataset_name': 'credit-fraud'}\n",
      "\n",
      "{'model_name': 'linear-sgd', 'fit_time_mean': '1.166', 'fit_time_std': '0.086', 'test_score_mean': '0.984', 'test_score_std': '0.002'}\n",
      "{'model_name': 'lr', 'fit_time_mean': '6.145', 'fit_time_std': '0.688', 'test_score_mean': '0.979', 'test_score_std': '0.002'}\n",
      "{'model_name': 'rf-100', 'fit_time_mean': '12.063', 'fit_time_std': '0.053', 'test_score_mean': '0.950', 'test_score_std': '0.007'}\n",
      "{'model_name': 'xgb', 'fit_time_mean': '94.712', 'fit_time_std': '0.766', 'test_score_mean': '0.981', 'test_score_std': '0.003'}\n",
      "{'model_name': 'ebm main', 'fit_time_mean': '99.003', 'fit_time_std': '12.889', 'test_score_mean': '0.975', 'test_score_std': '0.005'}\n",
      "{'model_name': 'ebm-interact', 'fit_time_mean': '435.343', 'fit_time_std': '61.869', 'test_score_mean': '0.978', 'test_score_std': '0.004'}\n"
     ]
    }
   ],
   "source": [
    "dataset = load_credit_data()\n",
    "result = benchmark_models('credit-fraud', dataset['full']['X'], dataset['full']['y'], n_splits=n_splits)\n",
    "results.append(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "------------------------------------------------------------------------------\n",
      "telco-churn\n",
      "------------------------------------------------------------------------------\n",
      "{'dataset_name': 'telco-churn'}\n",
      "\n",
      "{'model_name': 'linear-sgd', 'fit_time_mean': '2.236', 'fit_time_std': '0.213', 'test_score_mean': '0.798', 'test_score_std': '0.008'}\n",
      "{'model_name': 'lr', 'fit_time_mean': '25.970', 'fit_time_std': '2.931', 'test_score_mean': '0.804', 'test_score_std': '0.015'}\n",
      "{'model_name': 'rf-100', 'fit_time_mean': '3.310', 'fit_time_std': '1.204', 'test_score_mean': '0.824', 'test_score_std': '0.002'}\n",
      "{'model_name': 'xgb', 'fit_time_mean': '140.873', 'fit_time_std': '0.970', 'test_score_mean': '0.850', 'test_score_std': '0.006'}\n",
      "{'model_name': 'ebm main', 'fit_time_mean': '11.451', 'fit_time_std': '1.413', 'test_score_mean': '0.851', 'test_score_std': '0.005'}\n",
      "{'model_name': 'ebm-interact', 'fit_time_mean': '5.363', 'fit_time_std': '0.854', 'test_score_mean': '0.851', 'test_score_std': '0.005'}\n"
     ]
    }
   ],
   "source": [
    "dataset = load_telco_churn_data()\n",
    "result = benchmark_models('telco-churn', dataset['full']['X'], dataset['full']['y'], n_splits=3)\n",
    "results.append(result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "records = [item for result in results for item in result]\n",
    "record_df = pd.DataFrame.from_records(records)[['dataset_name', 'model_name', 'test_score_mean', 'test_score_std']]\n",
    "record_df.to_csv('ebm-perf-classification-overnight.csv')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
